In [52]:
import pandas as pd
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
import re
from collections import Counter

In [8]:
reviews = pd.read_csv('Reviews.csv')

In [9]:
reviews.head()


Out[9]:
Id ProductId UserId ProfileName HelpfulnessNumerator HelpfulnessDenominator Score Time Summary Text
0 1 B001E4KFG0 A3SGXH7AUHU8GW delmartian 1 1 5 1303862400 Good Quality Dog Food I have bought several of the Vitality canned d...
1 2 B00813GRG4 A1D87F6ZCVE5NK dll pa 0 0 1 1346976000 Not as Advertised Product arrived labeled as Jumbo Salted Peanut...
2 3 B000LQOCH0 ABXLMWJIXXAIN Natalia Corres "Natalia Corres" 1 1 4 1219017600 "Delight" says it all This is a confection that has been around a fe...
3 4 B000UA0QIQ A395BORC6FGVXV Karl 3 3 2 1307923200 Cough Medicine If you are looking for the secret ingredient i...
4 5 B006K2ZZ7K A1UQRSCLF8GW1T Michael D. Bigham "M. Wassir" 0 0 5 1350777600 Great taffy Great taffy at a great price. There was a wid...

In [10]:
reviews = reviews.dropna()
reviews = reviews.drop(['Id','ProductId','UserId','ProfileName','HelpfulnessNumerator','HelpfulnessDenominator',
                        'Score','Time'], axis=1)
reviews = reviews.reset_index(drop=True)

In [42]:
reviews.Text.isnull().sum()


Out[42]:
0

In [43]:
reviews = reviews[reviews.Summary.notnull()]

In [175]:
# Analysing some of the reviews
for i in range(712,714):
    print('Reviews #',i)
    print(reviews.Text[i])
    print(reviews.Summary[i])
    print('\n')


Reviews # 712
My husband (who, being Mexican, is very picky about his tortilla chips) and I absolutely love these!  The texture is light and crispy, rather than thick and crunchy. He actually usually prefers a very hearty, cruncy chip (Like El Ranchero), but the flavor of these is so fantastic that we're both thilled with them. The bean, rice and corn base makes them incredibly flavorful, and they have a touch of onion and garlic in addition to that. We go through an embarrassing amount of them.  I never, ever like plain chips, but these I can eat without anything else, although they're particularly amazing with a fresh salsa.  I highly recommend these!
Perfect tortilla chip goodness!


Reviews # 713
<a href="http://www.amazon.com/gp/product/B000GWLUGU">Plocky's Tortilla Chips, Red Beans 'N Rice, 7 Ounce Bag (Pack of 12)</a>  I first tasted these chips while visiting relatives in KY.  They are not available where I live, so I ordered them from Amazon.  WOW!  My friends and family are all addicted to them.  The spicy flavor grabs you at the first bite.  Once a bag is open, it is gone!
These chips are addictive!


Preprocessing the Dataset


In [45]:
# Map common contractions to their expanded forms
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [66]:
def clean_text(text,remove_stopwords=False):
    
    text = text.lower()
    clean_text = []
    for word in text.split():
        if word in contractions:
            clean_text.append(contractions[word])
        else:
            clean_text.append(word)
    text = " ".join(clean_text)
    
    # Format words and remove unwanted characters
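    # Note: the URL pattern below is greedy, so it removes everything from "http" to the end of that line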
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br', ' ', text)
    text = re.sub(r'/>', ' ', text)
    text = re.sub(r'>', ' ', text)
    text = re.sub(r'<', ' ', text)
    text = re.sub(r'`', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [182]:
reviews.Text[713]


Out[182]:
'<a href="http://www.amazon.com/gp/product/B000GWLUGU">Plocky\'s Tortilla Chips, Red Beans \'N Rice, 7 Ounce Bag (Pack of 12)</a>  I first tasted these chips while visiting relatives in KY.  They are not available where I live, so I ordered them from Amazon.  WOW!  My friends and family are all addicted to them.  The spicy flavor grabs you at the first bite.  Once a bag is open, it is gone!'


In [68]:
# Use new names so the clean_text function itself is not overwritten
clean_summaries = [clean_text(summary) for summary in reviews.Summary]
clean_texts = [clean_text(text) for text in reviews.Text]

Build Vocabulary


In [84]:
def build_vocabulary(texts,summaries):
    tokens = []
    for text in texts:
        tokens.extend(text.split())
    for summary in summaries:
        tokens.extend(summary.split())
    return Counter(tokens)

In [85]:
vocab = build_vocabulary(clean_texts,clean_summaries)

In [86]:
print("Size of Vocabulary:", len(vocab))


Size of Vocabulary: 126931

Embedding

Using pre-trained ConceptNet Numberbatch embeddings (https://github.com/commonsense/conceptnet-numberbatch)


In [76]:
embed_dim = 300
embeddings = {}
with open('embeddings/numberbatch-en-17.06.txt',encoding='utf-8') as em:
    for embed in em:
        em_line = embed.split(' ')
        word = em_line[0]
        embedding = np.array(em_line[1:], dtype=np.float32)
        embeddings[word] = embedding
print('Word embeddings:', len(embeddings))


Word embeddings: 417195

In [79]:
# Count words that are frequent enough to keep but missing from the embeddings
threshold = 20 # Discard words appearing fewer than 20 times.
missing_word_count = 0

for word in vocab:
    if vocab[word] >= threshold and word not in embeddings:
        missing_word_count += 1

print("Missing word count : ", missing_word_count)


Missing word count :  3609

In [87]:
# Keep words that appear at least `threshold` times or that have a pre-trained embedding
new_vocab = {word:count for word,count in vocab.items() if count >= threshold or word in embeddings}

print("Size of New Vocabulary:", len(new_vocab))
print("Percent of actual words used : ",(len(new_vocab)/len(vocab) * 100))


Size of New Vocabulary: 59213
Percent of actual words used :  46.64975459107704

In [88]:
vocab_to_int = {}
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]
for i,code in enumerate(codes):
    vocab_to_int[code] = i

for i,word in enumerate(new_vocab.keys(),4):
    vocab_to_int[word] = i

In [94]:
int_to_vocab = {i:word for word,i in vocab_to_int.items()}

In [101]:
def convert_text_int(texts):
    int_list = []
    for text in texts:
        word_ints = []
        for word in text.split():
            if word in vocab_to_int:
                word_ints.append(vocab_to_int[word])
            else:
                word_ints.append(vocab_to_int["<UNK>"])
        int_list.append(word_ints)
    return int_list

In [103]:
summary_int = convert_text_int(clean_summaries)
print(clean_summaries[2])
print(summary_int[2])


 delight  says it all
[48751, 55192, 7566, 34070]

In [174]:
print(clean_summaries[713])
print(summary_int[713])


these chips are addictive 
[50906, 16491, 4098, 56250]

In [108]:
text_int = convert_text_int(clean_texts)

In [110]:
print(clean_texts[201])
print(text_int[201])


i love  and use the empty containers for medicine advil in my purse  desk  suitcase  also line with felt to keep earrings  perfect little size to disguise small valuables when traveling and love the mints  tiny and powerful without burning my mouth 
[43923, 9028, 25752, 51565, 37515, 46647, 18540, 40405, 58764, 40248, 2555, 31965, 22875, 49243, 10690, 16755, 51368, 45174, 23204, 7940, 54857, 54746, 27269, 8711, 20535, 7940, 17739, 23437, 17499, 40305, 50181, 25752, 9028, 37515, 16415, 24552, 25752, 31476, 18910, 27745, 31965, 8130]

In [120]:
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings:
        word_embedding_matrix[i] = embeddings[word]
    else:
        # If word not in CN, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))


59217

Model Inputs


In [111]:
def model_inputs():
    input_ = tf.placeholder(dtype=tf.int32,shape=(None,None),name="input")
    target = tf.placeholder(dtype=tf.int32,shape=(None,None),name = "target")
    keep_prob = tf.placeholder(dtype=tf.float32,name="keep_prob")
    learning_rate = tf.placeholder(dtype=tf.float32,name="learning_rate")
    
    #for encoder decoder
    source_sequence_length = tf.placeholder(dtype=tf.int32,shape=(None,),name="source_sequence_length")
    target_sequence_length = tf.placeholder(dtype=tf.int32,shape=(None,),name="target_sequence_length")
    max_target_length = tf.reduce_max(target_sequence_length,name="max_target_length")
    return input_,target,keep_prob,learning_rate,source_sequence_length,target_sequence_length,max_target_length

In [112]:
#Process decoder input
def process_decoder_input(target_data,vocab_to_int,batch_size):
    
    strided_target = tf.strided_slice(target_data,(0,0),(batch_size,-1),(1,1))
    go = tf.fill(value=vocab_to_int["<GO>"],dims=(batch_size,1))
    decoder_input = tf.concat((go,strided_target),axis=1)
    return decoder_input
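
To picture what process_decoder_input does, here is the same transformation on plain Python lists (toy token ids only, not part of the original notebook): the last token of every row is dropped and the <GO> id is prepended.


In [ ]:
# Illustration only: the equivalent of process_decoder_input on Python lists
toy_batch = [[10, 11, 12],
             [20, 21, 22]]
go_id = vocab_to_int["<GO>"]          # 3
toy_decoder_input = [[go_id] + row[:-1] for row in toy_batch]
print(toy_decoder_input)              # [[3, 10, 11], [3, 20, 21]]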

Creating LSTM cells


In [113]:
# Create LSTM cells
def get_lstm(rnn_size,keep_prob=0.7):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm,input_keep_prob=keep_prob)
    return drop

Encoding Layer


In [114]:
def encoding_layer(embedded_rnn_input,rnn_size,keep_prob,num_layers,batch_size,source_sequence_length):
    # forward lstm layer
    cell_fw = tf.contrib.rnn.MultiRNNCell([get_lstm(rnn_size,keep_prob) for _ in range(num_layers)])
    # backward lstm layer
    cell_bw = tf.contrib.rnn.MultiRNNCell([get_lstm(rnn_size,keep_prob) for _ in range(num_layers)])
    
    output,output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw,cell_bw=cell_bw,inputs=embedded_rnn_input,
                                    sequence_length=source_sequence_length,dtype=tf.float32)
    
    # concatenate the forward and backward outputs
    output = tf.concat(output,axis=2)
    return output,output_states

Decoding Layer

Training Decoder


In [121]:
def training_decoder(embeded_rnn_input,target_sequence_length,decoder_cell,encoder_state,
                     output_layer,max_target_length):
    helper = tf.contrib.seq2seq.TrainingHelper(embeded_rnn_input,target_sequence_length)
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,helper,initial_state=encoder_state,
                                              output_layer=output_layer)
    # dynamic_decode may also return the final state and sequence lengths; keep only the outputs
    final_outputs = tf.contrib.seq2seq.dynamic_decode(decoder,impute_finished=True,
                                                      maximum_iterations=max_target_length)[0]
    return final_outputs

Inference Decoder


In [124]:
def inference_decoder(embedding_matrix,target_sequence_length,decoder_cell,encoder_state,
                     output_layer,max_target_length,batch_size):
    
    start_tokens = tf.tile(tf.constant([vocab_to_int["<GO>"]],dtype=tf.int32),multiples=[batch_size],name="start_tokens")
    
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_matrix,
                                                      start_tokens=start_tokens,
                                                      end_token=vocab_to_int["<EOS>"])
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,helper,initial_state=encoder_state,
                                              output_layer=output_layer)
    final_output = tf.contrib.seq2seq.dynamic_decode(decoder,impute_finished=True,
                                                     maximum_iterations=max_target_length)[0]
    return final_output

Building Decoder


In [125]:
def decoding_layer(decoder_input,target_sequence_length,encoder_state,max_target_length,batch_size,num_layers,
                   rnn_size):
    
    vocab_len = len(vocab_to_int)
    lstm_cell = tf.contrib.rnn.MultiRNNCell([get_lstm(rnn_size) for _ in range(num_layers)])
    # Dense projection from the decoder output to the vocabulary (a Layer object, applied inside the decoder)
    output_layer = tf.layers.Dense(vocab_len,kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))

    embedding = word_embedding_matrix
    embed = tf.nn.embedding_lookup(embedding,decoder_input)
    
    with tf.variable_scope("decoding"):
        train_decode_output = training_decoder(embed,target_sequence_length,lstm_cell,
                                               encoder_state,output_layer,max_target_length)
        
    with tf.variable_scope("decoding",reuse=True):
        infer_decode_output = inference_decoder(embedding,target_sequence_length,lstm_cell,encoder_state,
                                                output_layer,max_target_length,batch_size)
    
    return train_decode_output,infer_decode_output

Seq2Seq Modeling


In [127]:
def seq2seq_model(source_input,target_input,rnn_size,keep_prob,num_layers,batch_size,source_sequence_length,
                  target_sequence_length,max_target_length):
    
    embedding = word_embedding_matrix
    input_embed = tf.nn.embedding_lookup(embedding,source_input) 
    
    encoder_output,encoder_states = encoding_layer(input_embed,rnn_size,keep_prob,num_layers,batch_size,
                                                   source_sequence_length)
    
    # Prepend <GO> to the targets before feeding them to the decoder
    decoder_input = process_decoder_input(target_input,vocab_to_int,batch_size)
    
    # encoder_states is a (forward, backward) pair; the forward state matches the decoder cell's structure
    decoder_train_output, decoder_infer_output = decoding_layer(decoder_input, target_sequence_length, 
                                                                encoder_states[0], max_target_length, batch_size, 
                                                                num_layers, rnn_size)
    return decoder_train_output, decoder_infer_output

Batching


In [131]:
# Padding batches
def pad_sentence_batch(sentence_batch):
    max_length = max([len(sent) for sent in sentence_batch])
    print(max_length)
    padded_sentences = []
    for sent in sentence_batch:
        sent_len = len(sent)
        if len(sent) < max_length:
            padded_sentences.append(sent + [vocab_to_int["<PAD>"] for _ in range(max_length - sent_len)])
        else:
            padded_sentences.append(sent)
    return padded_sentences

In [132]:
sent= [[43923, 9028, 25752, 51565, 37515, 46647, 18540],
      [43923, 9028, 25752, 51565, 37515, 46647, 18540, 9028, 25752, 51565, 37515]]

print(pad_sentence_batch(sent))


11
[[43923, 9028, 25752, 51565, 37515, 46647, 18540, 1, 1, 1, 1], [43923, 9028, 25752, 51565, 37515, 46647, 18540, 9028, 25752, 51565, 37515]]

In [151]:
# Creating Batches
from operator import itemgetter

# pair each text index with its length (sorted from shortest to longest below)
text_len_sorted= [(i,len(text)) for i,text in enumerate(text_int)]

In [156]:
len(text_int[494051])


Out[156]:
853

In [163]:
sorted_text_len = sorted(text_len_sorted,key= lambda x:x[1])

In [164]:
sorted_text_int = [text_int[i] for i,j in sorted_text_len]
sorted_summary_int = [summary_int[i] for i,j in sorted_text_len]  # keep the summaries aligned with their texts

In [173]:
clean_summaries[713]


Out[173]:
'these chips are addictive '

In [169]:
sorted_text_len


Out[169]:
[(713, 0),
 (835, 0),
 (1059, 0),
 (1345, 0),
 (2032, 0),
 ...]

(output truncated; the full list pairs every review index with its cleaned-text length, shortest first)
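
The padding helper above is not yet wired into a batch generator. A minimal sketch of one is below; get_batches is not part of the original notebook, it assumes sorted_text_int and sorted_summary_int are index-aligned, and it appends <EOS> to each summary so the decoder can learn when to stop.


In [ ]:
# Sketch only: yield padded (text, summary) batches plus their unpadded lengths
def get_batches(texts, summaries, batch_size):
    for start in range(0, len(texts) // batch_size * batch_size, batch_size):
        text_batch = texts[start:start + batch_size]
        summary_batch = [s + [vocab_to_int["<EOS>"]] for s in summaries[start:start + batch_size]]
        pad_text_batch = np.array(pad_sentence_batch(text_batch))
        pad_summary_batch = np.array(pad_sentence_batch(summary_batch))
        text_lengths = np.array([len(t) for t in text_batch])
        summary_lengths = np.array([len(s) for s in summary_batch])
        yield pad_text_batch, pad_summary_batch, text_lengths, summary_lengths

The yielded lengths are what the source_sequence_length and target_sequence_length placeholders from model_inputs() expect at feed time.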

Hyperparameters


In [128]:
# Number of Epochs
epochs = 3
# Batch Size
batch_size = 250
# RNN Size
rnn_size = 100
# Number of Layers
num_layers = 3
# Embedding Size
encoding_embedding_size = 200
decoding_embedding_size = 200
# Learning Rate
learning_rate = 0.01
# Dropout Keep Probability
keep_probability = 0.8
display_step = 5

Building Graph


In [ ]:
save_path = 'checkpoints/dev'

source_int_text = text_int
target_int_text = summary_int

train_graph = tf.Graph()
with train_graph.as_default():
    
    input_,target,keep_prob,learning_rate,source_sequence_length,target_sequence_length,max_target_length = model_inputs()
    seq2seq_graph = seq2seq_model(input_,target,rnn_size,keep_probability,num_layers,batch_size,
                                  source_sequence_length,target_sequence_length,max_target_length)
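
The cell above stops after building the seq2seq outputs. One possible way to finish the graph, sketched here as an assumption rather than the notebook's original code, is to mask the padded target positions and minimize tf.contrib.seq2seq.sequence_loss with clipped Adam updates:


In [ ]:
# Sketch only: cost and optimizer for the graph built above
with train_graph.as_default():
    train_output, infer_output = seq2seq_graph
    training_logits = tf.identity(train_output.rnn_output, name='logits')
    inference_ids = tf.identity(infer_output.sample_id, name='predictions')

    # Ignore <PAD> positions when computing the loss
    masks = tf.sequence_mask(target_sequence_length, max_target_length,
                             dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, target, masks)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        # Clip gradients to keep training stable
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                            for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)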